Test the BP implementation against the skbio.TreeNode using real world trees provided by Greengenes 13_8. The following conditions are checked:

  • the topology, names, and lengths on a direct parse from newick are identical in preorder traversal
  • a serialization back to newick is identical by string comparison
  • that TreeNode.to_array is reproduced minimally for unifrac
  • that shear/collapse operations result in equivalent trees

In [1]:
%load_ext autoreload
%autoreload 2
from bp import parse_newick, to_skbio_treenode, to_skbio_treearray
from skbio import TreeNode
import numpy as np
import glob
from random import shuffle

In [2]:
def _correct_gg_reroot_length_issue(t):
    # the greengenes trees on reroot had a node with a length set to None
    # find and correct if it exists
    try:
        gg_reroot_none_node = t.find('k__Bacteria')
        gg_reroot_none_node.length = 0.0
    except:
        pass

    return t

def preorder_names(fp):
    """Find any preorder node name inconsistencies as a proxy for topology testing"""
    skt = _correct_gg_reroot_length_issue(TreeNode.read(fp))
    bpt = parse_newick(open(fp).read())
    
    for sk_node, k in zip(skt.preorder(include_self=True), range(bpt.B.sum())):
        bp_idx = bpt.preorderselect(k)
        if (sk_node.name != bpt.name(bp_idx)) or (sk_node.length != bpt.length(bp_idx)):
            # bpt right now uses 0.0 for root length
            if sk_node.is_root() and bp_idx == bpt.root():
                continue
            else:
                return sk_node, bp_idx
    
    return None, None

def newick_comparison(fp):
    """Verify newick output is consistent
    
    Note: the BP tree is converted to TreeNode
    """
    tn = str(_correct_gg_reroot_length_issue(TreeNode.read(fp)))
    bp = str(to_skbio_treenode(parse_newick(open(fp).read())))
    
    for i in range(len(tn)):
        if tn[i] != bp[i]:
            return (tn[i-25:i+25], bp[i-25:i+25])
    return None, None
            
def check_to_array(fp):
    skt = _correct_gg_reroot_length_issue(TreeNode.read(fp)).to_array(nan_length_value=0.0)
    bpt = to_skbio_treearray(parse_newick(open(fp).read()))
    
    if list(skt['id_index'].keys()) != list(bpt['id_index'].keys()):
        return 'id_index keys are not equal'
    
    for k in skt['id_index']:
        if skt['id_index'][k].is_tip() != bpt['id_index'][k].is_tip():
            return "id index tip identification is not equal"
    
    if not np.allclose(skt['child_index'], bpt['child_index']):
        return 'child_index is not equal'
    
    if not np.allclose(skt['length'], bpt['length']):
        return 'length is not equal'
    
    return None
    
def check_shear(fp):
    """Verify a random shear/collapse is comparable
    
    Note: skbio.TreeNode can alter the order of children in the tree. This does not
    represent a change in topology. Because of this, we are testing node subsets
    which are invariant to child order. 
    """
    skt = _correct_gg_reroot_length_issue(TreeNode.read(fp))
    bpt = parse_newick(open(fp).read())
    
    # determine which tips to keep
    names = [n.name for n in skt.tips()]
    shuffle(names)
    to_keep = int(np.ceil(len(names) * 0.1))
    names_to_keep = set(names[:to_keep])
    
    # shear the treenode
    skt_shear = skt.shear(names_to_keep) 
    bpt_shear = bpt.shear(names_to_keep).collapse()
    
    res = skt_shear.subsets() == to_skbio_treenode(bpt_shear).subsets()
    if res:
        return None
    else:
        return "shear/collapse is not equivalent"

In [3]:
problems = {}
for f in glob.glob('../../../greengenes_release/gg_13_8_otus/trees/*_otus.tree'):
    obs = {}
    key = f.rsplit('/')[-1]
    print(key)
    
    obs['preorder_names'] = preorder_names(f)
    obs['newick_comparison'] = newick_comparison(f)
    obs['check_to_array'] = check_to_array(f)
    obs['shear/collapse'] = check_shear(f)
    problems[key] = obs
problems


61_otus.tree
64_otus.tree
67_otus.tree
70_otus.tree
73_otus.tree
76_otus.tree
79_otus.tree
82_otus.tree
85_otus.tree
88_otus.tree
91_otus.tree
94_otus.tree
97_otus.tree
99_otus.tree
Out[3]:
{'61_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '64_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '67_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '70_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '73_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '76_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '79_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '82_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '85_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '88_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '91_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '94_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '97_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None},
 '99_otus.tree': {'check_to_array': None,
  'newick_comparison': (None, None),
  'preorder_names': (None, None),
  'shear/collapse': None}}

In [ ]: